Laboratório: UKgrid

Utilizaremos o conjunto de dados UKgrid do pacote UKgrid do R para ilustrar como podemos utilizar o modelo de regressão linear para lidar com problemas de multiplos períodos sazonais. Nosso objetivo é fazer previsão para os próximos dias
Autor

Carlos Trucíos

Pacotes

library(UKgrid)
library(TSstudio)
library(dplyr)
library(forecast)
library(lubridate)
library(plotly)

Dados

data("UKgrid")
glimpse(UKgrid)
Rows: 254,592
Columns: 9
$ TIMESTAMP                 <dttm> 2005-04-01 00:00:00, 2005-04-01 00:30:00, 2…
$ ND                        <int> 32926, 32154, 33633, 34574, 34720, 34452, 33…
$ I014_ND                   <int> 32774, 32032, 33495, 34460, 34641, 34347, 33…
$ TSD                       <int> 34286, 33885, 35082, 36028, 36179, 35921, 35…
$ ENGLAND_WALES_DEMAND      <int> 29566, 28871, 30340, 31253, 31325, 31094, 30…
$ EMBEDDED_WIND_GENERATION  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ EMBEDDED_WIND_CAPACITY    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ EMBEDDED_SOLAR_GENERATION <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ EMBEDDED_SOLAR_CAPACITY   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
UKdaily <- extract_grid(type = "data.frame", columns = "ND", aggregate = "daily")
head(UKdaily)
   TIMESTAMP      ND
1 2005-04-01 1920069
2 2005-04-02 1674699
3 2005-04-03 1631352
4 2005-04-04 1916693
5 2005-04-05 1952082
6 2005-04-06 1964584
ts_plot(UKdaily, title = "Demanda Nacional Diária de Energia no Reino Unido", Ytitle = "MW", Xtitle = "Ano")

EDA

  • Tendência?
  • Sazonalidade?
    • Qual o período sazonal?
    • exisem outros possíveis períodos sazonais?
ts_heatmap(UKdaily[which(year(UKdaily$TIMESTAMP) >= 2015),],
title = "Mapa de calor da demanda nacional diária (detrended) de energia no Reino Unido")
UKdaily_df <- UKdaily |> 
  mutate(weekday = wday(TIMESTAMP, label = TRUE),
         month = month(TIMESTAMP, label = TRUE),
         lag365 = dplyr::lag(ND, 365))
UKdaily_summarise <- UKdaily_df |> group_by(weekday, month) |> 
  summarise(Media = mean(ND, na.rm = TRUE))
plot_ly(data = UKdaily_summarise, x = ~month, y = ~Media, type =
"bar",color = ~weekday) |> 
  layout(title = "Demanda média diária de energia por mês")
str(UKdaily_df)
'data.frame':   5304 obs. of  5 variables:
 $ TIMESTAMP: Date, format: "2005-04-01" "2005-04-02" ...
 $ ND       : int  1920069 1674699 1631352 1916693 1952082 1964584 1990895 2003982 1811436 1684720 ...
 $ weekday  : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 6 7 1 2 3 4 5 6 7 1 ...
 $ month    : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 4 4 4 4 4 4 4 4 ...
 $ lag365   : int  NA NA NA NA NA NA NA NA NA NA ...
start_date <- min(UKdaily_df$TIMESTAMP)
start <- c(year(start_date), yday(start_date))
UK_ts <- ts(UKdaily_df$ND, start = start, frequency = 365)
acf(UK_ts, lag.max = 365*6)

Modelo

Apenas para poder comparar observados e estimado, vamos dividir a série:

h <- 365
UKpartitions <- ts_split(UK_ts, sample.out = h)
train_ts <- UKpartitions$train
test_ts <- UKpartitions$test
train_df <- UKdaily_df[1:(nrow(UKdaily_df) - h), ]
test_df <- UKdaily_df[(nrow(UKdaily_df) - h + 1):nrow(UKdaily_df), ]
modelo_tslm <- tslm(train_ts ~ season + trend + weekday + month , data = train_df)
fore <- forecast(modelo_tslm, h = h, newdata = test_df)
test_forecast(actual = UK_ts, forecast.obj = fore, test = test_ts)
accuracy(fore , test_ts)
                        ME     RMSE      MAE        MPE     MAPE      MASE
Training set -7.233089e-11 70611.20 52773.55 -0.1675864 3.174944 0.4396604
Test set     -1.891129e+04 82845.32 66699.81 -1.4303754 4.771410 0.5556811
                  ACF1 Theil's U
Training set 0.7539041        NA
Test set     0.6152629 0.6998989
modelo_tslm_2 <- tslm(train_ts ~ trend + weekday + month , data = train_df)
fore_2 <- forecast(modelo_tslm_2, h = h, newdata = test_df)
accuracy(fore_2 , test_ts)
                        ME     RMSE      MAE        MPE     MAPE      MASE
Training set -5.226344e-11 87550.26 62678.62 -0.2555417 3.756523 0.5221803
Test set     -1.833883e+04 95112.70 74873.84 -1.5121833 5.331113 0.6237796
                  ACF1 Theil's U
Training set 0.8035804        NA
Test set     0.7091169 0.8029323
modelo_tslm_3 <- tslm(train_ts ~ season + trend + weekday + month + lag365, data = train_df)
fore_3 <- forecast(modelo_tslm_3, h = h, newdata = test_df)
accuracy(fore_3 , test_ts)
                        ME     RMSE      MAE        MPE     MAPE      MASE
Training set -7.603646e-11 69904.92 52006.75 -0.1717563 3.163385 0.4359116
Test set     -1.754061e+04 81783.55 65957.66 -1.3613252 4.722083 0.5528457
                  ACF1 Theil's U
Training set 0.7500146        NA
Test set     0.6094666 0.6925767

Modelo para produçãp

modelo_final <- tslm(UK_ts ~ season + trend + weekday + month + lag365, data = UKdaily_df)
checkresiduals(modelo_final, plot = TRUE)


    Breusch-Godfrey test for serial correlation of order up to 730

data:  Residuals from Linear regression model
LM test = 3301.1, df = 730, p-value < 2.2e-16

O modelo não captura apropriadamente a dinâmica dos dados. Algumas alternativas são incluir variáveis exógenas ou, após utilizar o modelo de regressão, modelar os resíduos por um modelo da ARIMA